#导入模块
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline
# Column labels for the data set: an id, the diagnosis label, and then ten
# measurements each reported as a mean, a standard error ("se") and a "worst"
# value.
column_names = [
    'id', 'diagnosis',
    # *_mean columns
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    # *_se columns
    'radius_se', 'texture_se', 'perimeter_se', 'area_se',
    'smoothness_se', 'compactness_se', 'concavity_se',
    'concave points_se', 'symmetry_se', 'fractal_dimension_se',
    # *_worst columns
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
]
# Fetch the data set from the UCI repository with pandas.read_csv; the column
# labels are supplied explicitly through `names`.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    names=column_names,
)
# Print the column dtypes and non-null counts as a quick sanity check.
df.info()
# 2) 诊断(M = 恶性,B = 良性)
# 3-32) 为每个细胞核计算十个实值特征:
# Preview the first ten rows of the loaded table.
df.head(10)
# Bar chart of how many samples fall into each diagnosis class.
# The Series is passed as the `x` keyword: in current seaborn releases the
# only positional parameter of countplot is `data`, so passing the Series
# positionally no longer selects it as the variable to count.
sn.countplot(x=df["diagnosis"], label='count')
# The id column is not used for classification, so it is removed; the
# diagnosis column is the classification target and is kept aside in `dia`.
# The meaning of the individual feature names does not need to be understood
# for the steps that follow.
dia = df['diagnosis']
# Remove both non-feature columns from df in a single in-place call.
df.drop(columns=['id', 'diagnosis'], inplace=True)
# `data` is a second name for the same DataFrame object, not a copy.
data = df
# Summary statistics of the remaining feature columns.
data.describe()
# Build a scatter matrix over the feature columns.
# The scatter matrix shows that features in this data set are correlated with
# other features.
pd.plotting.scatter_matrix(
    data,
    alpha=0.3,
    figsize=(80, 60),
    diagonal='kde',
);
# Heat map of the pairwise feature correlations, annotated with two decimals.
plt.subplots(figsize=(20, 20))
sn.heatmap(
    data.corr(),
    annot=True,
    linewidths=.5,
    fmt='.2f',
    cmap='coolwarm',
)